In [1]:
import urllib2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import linear_model
from sklearn import metrics
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from bs4 import BeautifulSoup

%matplotlib inline

In [2]:
def get_tag_name(tag_url):
    """Takes a Atlas Obscura db dump and turns the tag_url into bare tags."""
    return tag_url.split('/')[2]

def get_dummies_and_join(df,var_column):
    """gets dummies for a var_column in a dataframe and returns a dataframe with
    those dummies joined on."""
    df = df.join(pd.get_dummies(df[var_column]))
    return df

def unique_article_set(df,column):
    """Takes a DataFrame and a column to groupby on and returns the DF, using
    sum as the aggfunc for the remaining columns. Typically this will be a
    list of articles containing duplicates with an array of dummy vars
    attached to it. Column is a string, df is a pandas DataFrame"""
    return df.groupby(column).sum()

##Skipped over the Time Series section. Should come later.

def append_atlas_url(prefix,column):
    """Adds a given prefix to items in a Series and returns a list with the
    prefix added to each item. Typically should be set equal to the input
    Series so as to assign the new list back to the DataFrame."""
    return [prefix + x for x in column]

def get_simplereach_url(x):
    """Strips the 'http://' prefix from a SimpleReach URL; returns None for
    non-string values (e.g. NaN)."""
    if type(x) == str:
        return x.split('http://')[1]
    else:
        return None

def target_pageview_cutoff(pageviews,pv_column):
    """Creates a target vector of categorical values (1,0) based on whether a
    particular article is in excess of the PV cutoff."""
    return [1 if x > pageviews else 0 for x in pv_column]

def get_year(df,date_column):
    """Returns a Series of just the year based on a date column passed into the
    function."""
    return pd.DatetimeIndex(df[date_column]).year

def get_total_tagged(df,new_col_name):
    """Takes the articles and tags dataframe, sums the tag vectors, and Creates
    a new dataframe containing the tags and the number of articles tagged."""
    total_tagged = pd.DataFrame(data=df.sum(),columns=[new_col_name])
    total_tagged = total_tagged.sort_values('num_tagged',ascending=False).drop(
                                                        'pageviews',
                                                        axis=0,
                                                        )
    return total_tagged

def get_interaction_features(df,interaction_tags,total_tagged,tag_threshold):
    """Takes the articles and tags DataFrame and creates binary interaction
    features using a list passed as interaction_tags as the tags to be paired
    with the rest of the set. DataFrame must only contain index and
    categorical features. DataFrame will return interaction features only for
    tags having article counts above the threshold set as tag_threshold."""
    interactions = pd.DataFrame(index=df.index)
    kept_tags = df.drop(total_tagged[total_tagged.num_tagged < tag_threshold].index,axis=1)
    for item in interaction_tags:
        for column in kept_tags.columns:
            interactions[(item + '_' + column)] = df[item] + df[column]
    return interactions

def correct_values(interactions_df):
    """Changes any values of 2 to 1 and 1s to 0s. This is because the dataframe
    of interactions is created in get_interaction_features() by adding two columns
    together to generate a composite column that represents an interaction
    feature. Where there's a 2, that article has both tags. This should be 1.
    Where there's a 1, the article only has 1 of the tags, and is falsey for that
    tag."""
    def two_to_one(x):
        if x == 2.0:
            return 1
        else:
            return 0
    for x in interactions_df.columns:
        interactions_df[x] = interactions_df[x].apply(two_to_one)
    return interactions_df

def drop_identity_tags(interactions_df):
    """Takes the interaction dataframe and drops the tags where the interaction_tags
    paired to one another."""
    tagged_total = get_total_tagged(interactions_df,'num_tagged')
    interactions_df = interactions_df.drop(tagged_total[0:26].index,axis=1)
    return interaction_df

def drop_zero_cols(df):
    """Takes a dataframe and drops any column that sums to zero. This is used to
    drop categorical variables that not a single datapoint are positive for. cols
    should only be categorical variables encoded (0,1)."""
    df = df.fillna(0)
    for item in df.columns:
        if df[item].sum() == 0:
            df = df.drop(item,axis=1)
        else:
            continue
    return df

def drop_tag_features(df,tag_threshold):
    """Takes a dataframe of articles and tag features and returns a subset of df
    that contains only the tags that have corresponding article counts above the
    tag_threshold."""
    total_tagged = get_total_tagged(df,'num_tagged')
    df2 = df.drop(total_tagged[total_tagged.num_tagged < tag_threshold].index,axis=1)
    return df2

def get_cross_validation_score(X,y,estimator,n_folds):
    """Dependent on sklearn cross_validation and numpy. Takes an X and y and returns the cross_validation accuracy score for a given linear_model.estimator."""
    kf = cross_validation.KFold(len(X),n_folds=n_folds)
    scores = []
    for train_index, test_index in kf:
        lr = estimator.fit(X.iloc[train_index],y.iloc[train_index])
        scores.append(lr.score(X.iloc[test_index],y.iloc[test_index]))
    return np.mean(scores)

def get_probabilities(model,X):
    """Takes an estimator model that has been fix and the X used in that model
    and returns the probabilities from predict_proba with the original feature
    names zipped back up with them."""
    interactions_model_scores = model.predict_proba(X)[:,1]
    interactions_probabilities = pd.DataFrame(zip(X.columns,interactions_model_scores),columns=['tags','probabilities'])
    interactions_probabilities = interactions_probabilities.sort_values('probabilities',ascending=False)
    return interactions_probabilities

def get_sub_tags(df):
    """Takes a probabilities DataFrame whose 'tags' column holds interaction
    names of the form 'tag_subtag' and adds the subtag as a new column."""
    df = df.reset_index()
    df['subtag'] = df.tags.apply(lambda x: x.split('_')[1])
    return df

def get_pageviews(probabilities,df):
    """Sums the pageviews of all articles corresponding to a tag and returns the probabilities dataframe with a new column of total PVs for each tag."""
    probabilities['pageview'] = [sum(df['pageviews'][df[item] == 1]) for item in probabilities['tags']]
    return probabilities

def get_roc_scores(y,scores):
    return metrics.roc_auc_score(y,scores)

def get_article_tags(url):
    """Uses BeautifulSoup to scrape Atlas pages for our tags"""
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page, 'lxml')
    if "/articles/" in url:
        taglist = [elem.get_text() for elem in soup.select('div.item-tags a')]
    elif "/places" in url:
        taglist = [elem.get_text() for elem in soup.select('div.item-tags a')]
    else:
        taglist = ""
    for count,item in enumerate(taglist):
        taglist[count] = item.split('#')[1]
    return url, taglist

def transform_article_for_prediction(url,whole_set):
    url, taglist = get_article_tags(url)
    tags = {}
    for item in taglist:
        tags[item] = [1]
    article_tag_vectors = pd.DataFrame(tags)
    article_tag_vectors = article_tag_vectors.reindex(columns=whole_set.columns,
                                                      fill_value=0)
    return article_tag_vectors
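
The interaction-feature helpers above (get_interaction_features, correct_values, drop_identity_tags, drop_zero_cols) are defined but not called later in this notebook. A minimal sketch of how they are meant to chain together, assuming a hypothetical tag_df that holds only the article index and 0/1 tag columns, a hypothetical choice of interaction_tags, and an arbitrary tag_threshold of 20:

total_tagged = get_total_tagged(tag_df, 'num_tagged')
interaction_tags = ['100-wonders', 'abandoned']   # hypothetical tags to pair against the rest
interactions = get_interaction_features(tag_df, interaction_tags, total_tagged, tag_threshold=20)
interactions = correct_values(interactions)       # 2 -> 1 (article has both tags), 1 -> 0 otherwise
interactions = drop_zero_cols(interactions)       # discard pairings no article actually has

drop_identity_tags can then be used to strip the columns where the interaction tags are paired with themselves.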

First we import the table of tag-article mappings from our SQL database (but read in as a .csv).

In [3]:
df = pd.read_csv('atlas-taggings.csv')

In [4]:
df[2:5]


Out[4]:
tag_id tag_url tagged_type tagged_id tagged_url
2 2 www.atlasobscura.com/categories/panoramas Place 6431 www.atlasobscura.com/places/gettysburg-cyclorama
3 2 www.atlasobscura.com/categories/panoramas Article 2311 www.atlasobscura.com/articles/rip-gettysburg-c...
4 258 www.atlasobscura.com/categories/bridges Place 10134 www.atlasobscura.com/places/gimbel-s-bridge

We only care about the content type "Article"


In [5]:
articles = df[df.tagged_type == 'Article']

Next we need to extract the bare tag name from the tag URL string.


In [6]:
articles.tag_url = articles.tag_url.apply(get_tag_name)


/Users/Mike/anaconda/lib/python2.7/site-packages/pandas/core/generic.py:2698: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
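
The warning is harmless here, but it can be avoided by taking an explicit copy of the slice (or by assigning with .loc, as the warning suggests). A minimal sketch of the same two steps:

articles = df[df.tagged_type == 'Article'].copy()
articles.loc[:, 'tag_url'] = articles.tag_url.apply(get_tag_name)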

In [7]:
articles = get_dummies_and_join(articles,'tag_url')
articles = articles.drop(['tag_id','tag_url','tagged_type','tagged_id'],axis=1)

In [8]:
articles = unique_article_set(articles,'tagged_url')

In [9]:
articles = articles.reset_index().set_index('tagged_url')

Import the table of URLs and total pageviews


In [10]:
pageviews = pd.read_csv('output_articles_performance.csv',header=None,names=[
        'url','published','pageviews'
    ])

In [11]:
pageviews.url = ['www.atlasobscura.com/articles/' + x for x in pageviews.url]

In [12]:
pageviews.describe()


Out[12]:
pageviews
count 3446.000000
mean 7052.891759
std 23256.215270
min 1.000000
25% 1150.250000
50% 2571.500000
75% 5834.750000
max 621494.000000
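
Pageviews are heavily right-skewed (median ~2,600 vs. mean ~7,000), which is one reason to work with a binary pageview cutoff below rather than the raw counts. A log-scale view makes the shape easier to see; a minimal sketch, assuming np.log over the pageview column:

np.log(pageviews.pageviews.dropna()).plot(kind='density', title='Log Page View Distribution, All Articles')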

In [13]:
pageviews.set_index('url',inplace=True)
article_set = articles.join(pageviews)

In [14]:
article_set['ten_thousand'] = target_pageview_cutoff(10000,article_set.pageviews)

In [15]:
article_set['published'] = pd.to_datetime(article_set['published'])
article_set['year'] = get_year(article_set,'published')

In [54]:
article_set.pageviews.plot(kind='density',title='Page View Distribution, All Articles')

In [17]:
ax = article_set.boxplot(column='pageviews',by='year',figsize=(6,6),showfliers=False)
ax.set(title='PV distribution by year of publication, no outliers',ylabel='pageviews')


Out[17]:
[<matplotlib.text.Text at 0x119e4ac50>, <matplotlib.text.Text at 0x1062cc950>]

In [18]:
sns.factorplot(
    x='year',
    y='ten_thousand',
    data = article_set
    )


Out[18]:
<seaborn.axisgrid.FacetGrid at 0x11aab0e90>

In [19]:
total_tagged = get_total_tagged(article_set,'num_tagged')

In [20]:
article_set.fillna(value=0,inplace=True)

In [21]:
y = article_set.ten_thousand
X = article_set.drop(['pageviews','published','ten_thousand'],axis=1)

In [22]:
cross_val_score = get_cross_validation_score(X,y,linear_model.LogisticRegression(penalty = 'l1'),
                                            n_folds=5)

In [23]:
lr = linear_model.LogisticRegression(penalty = 'l1').fit(X,y)

In [24]:
lr_scores = lr.predict_proba(X)[:,1]
roc_score = get_roc_scores(y,lr_scores)
print roc_score


0.797464130142

In [25]:
single_tag_probabilities = get_probabilities(lr,X)

Now we will explore how a KNN classifier does with our dataset


In [26]:
params = {'n_neighbors' : [x for x in range(2,100,4)],
          'weights' : ['distance','uniform']}
gs = GridSearchCV(estimator = KNeighborsClassifier(),param_grid=params,
                  n_jobs=-1,cv=10,verbose=1)
gs.fit(X,y)
print gs.best_params_
print gs.best_score_

knn = gs.best_estimator_.fit(X,y)
knn_probs = get_probabilities(knn,X)


Fitting 10 folds for each of 50 candidates, totalling 500 fits
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   45.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.2min finished
{'n_neighbors': 30, 'weights': 'uniform'}
0.848560255954

In [27]:
knn_cross_val_score = get_cross_validation_score(X,y,knn,5)
knn_scores = knn.predict_proba(X)[:,1]
knn_roc_score = get_roc_scores(y,knn_scores)

Now we will explore how a RandomForest classifier does with our dataset


In [55]:
params_rfc = {'max_depth': np.arange(20,100,5),
             'min_samples_leaf': np.arange(90,200,5),
             'n_estimators': [20],
             'criterion' : ['gini','entropy']
                }
gs1 = GridSearchCV(RandomForestClassifier(),param_grid=params_rfc, cv=10, scoring='roc_auc',n_jobs=-1,verbose=1)
gs1.fit(X,y)
print gs1.best_params_
print gs1.best_score_


Fitting 10 folds for each of 704 candidates, totalling 7040 fits
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:   34.5s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:   50.8s
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 7040 out of 7040 | elapsed:  3.1min finished
{'n_estimators': 20, 'criterion': 'gini', 'max_depth': 85, 'min_samples_leaf': 100}
0.631050275454


In [29]:
rf = gs1.best_estimator_
rf.fit(X,y)

rf_cross_val_score = get_cross_validation_score(X,y,rf,5)
rf_scores = rf.predict_proba(X)[:,1]
rf_roc_score = get_roc_scores(y,rf_scores)

In [30]:
print "Logistic Regression Cross-validation Score: ", cross_val_score
print "K Nearest Neighbors Cross-validation Score: ", knn_cross_val_score
print "RandomForest Cross-validation Score: ", rf_cross_val_score

print "Logistic Regressions ROC AUC score: ", roc_score
print "K Nearest Neighbors ROC AUC score: ", knn_roc_score
print "RandomForest ROC AUC score: ", rf_roc_score


Logistic Regression Cross-validation Score:  0.8499573333
K Nearest Neighbors Cross-validation Score:  0.848535109954
RandomForest Cross-validation Score:  0.848179238068
Logistic Regressions ROC AUC score:  0.797464130142
K Nearest Neighbors ROC AUC score:  0.714784820116
RandomForest ROC AUC score:  0.598521135193
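
The three cross-validation accuracies are nearly identical while the ROC AUC scores differ substantially, so the ROC curves themselves are worth a look. A minimal sketch using metrics.roc_curve on the in-sample scores computed above:

for name, scores in [('Logistic Regression', lr_scores),
                     ('K Nearest Neighbors', knn_scores),
                     ('RandomForest', rf_scores)]:
    fpr, tpr, _ = metrics.roc_curve(y, scores)
    plt.plot(fpr, tpr, label=name)
plt.plot([0, 1], [0, 1], linestyle='--', label='chance')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')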

Prediction of a given URL


In [31]:
url, taglist = get_article_tags('http://www.atlasobscura.com/articles/the-ao-exit-interview-12-years-in-the-blue-man-group')

In [32]:
transformed_article = transform_article_for_prediction(url,article_set)
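
Because transform_article_for_prediction was given article_set as the whole_set, the returned vector still carries the non-tag columns (pageviews, published, and so on). Aligning it to the training features and scoring it with the logistic regression fit above looks roughly like this (a minimal sketch):

article_X = transformed_article.reindex(columns=X.columns, fill_value=0)
print lr.predict(article_X)
print lr.predict_proba(article_X)[:,1]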

Refining the model

We will reproduce our results looking only at articles published in 2016


In [33]:
article_set.head(1)


Out[33]:
100-wonders 19th-century 2016-election 30-rock 31-days-of-halloween abandoned abandoned-amusement-parks abandoned-brooklyn abandoned-cemetaries abandoned-hospitals ... wwi wwii yehlui-geological-park yeti zombies zoos published pageviews ten_thousand year
tagged_url
www.atlasobscura.com/articles/10-little-known-beaches-to-explore-in-the-last-days-of-summer 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 2015-08-01 651.0 0 2015.0

1 rows × 987 columns


In [34]:
y1 = article_set[article_set.year >= 2016].ten_thousand
X1 = article_set[article_set.year >= 2016].drop(['pageviews','published','ten_thousand'],axis=1)

In [35]:
cross_val_score1 = get_cross_validation_score(X1,y1,linear_model.LogisticRegression(penalty = 'l1'),
                                            n_folds=5)
lr1 = linear_model.LogisticRegression(penalty = 'l1').fit(X1,y1)
lr_scores1 = lr1.predict_proba(X1)[:,1]
roc_score1 = get_roc_scores(y1,lr_scores1)
print roc_score1


0.82455608751

Now we will rebuild our model to have it predict if an article will receive over 500 Facebook shares.


In [36]:
simplereach = pd.read_csv('~/Downloads/all-content-simplereach.csv')

In [37]:
simplereach.Url = simplereach.Url.apply(get_simplereach_url)
simplereach = simplereach.set_index('Url')
simplereach = simplereach[['Avg Engaged Time','Social Actions','Facebook Shares','FaceBook Referrals']]

In [38]:
article_set2 = article_set.join(simplereach['Facebook Shares'])

In [39]:
article_set2['five_hundred_shares'] = target_pageview_cutoff(500,article_set2['Facebook Shares'])

Logistic Regression with FB Shares > 500 as target


In [48]:
y2 = article_set2.five_hundred_shares
X2 = article_set2.drop(['pageviews',
                        'published',
                        'ten_thousand',
                        'Facebook Shares',
                        'five_hundred_shares'
                           ],axis=1)

cross_val_score_social = get_cross_validation_score(X2,y2,linear_model.LogisticRegression(penalty = 'l1'),
                                            n_folds=5)
lr_social = linear_model.LogisticRegression(penalty = 'l1').fit(X2,y2)
lr_scores_social = lr_social.predict_proba(X2)[:,1]
roc_score_social = get_roc_scores(y2,lr_scores_social)
print "Cross-val score when predicting Facebook shares > 500: ", cross_val_score_social
print "ROC AUC score when predicting Facebook shares > 500: ",roc_score_social


Cross-val score when predicting Facebook shares > 500:  0.907193289634
ROC AUC score when predicting Facebook shares > 500:  0.836639925702

In [41]:
url = 'http://www.atlasobscura.com/articles/winters-effigies-the-deviant-history-of-the-snowman'
lr_social.predict(transform_article_for_prediction(url,X2))


Out[41]:
array([0])

KNN with FB Shares > 500 as target


In [50]:
params_social = {'n_neighbors' : [x for x in range(2,100,4)],
          'weights' : ['distance','uniform']}
gs_social = GridSearchCV(estimator = KNeighborsClassifier(),param_grid=params_social,
                  n_jobs=-1,cv=10,verbose=1)
gs_social.fit(X2,y2)
print gs_social.best_params_
print gs_social.best_score_

knn_social = gs_social.best_estimator_.fit(X2,y2)
knn_probs_social = get_probabilities(knn_social,X2)
knn_cross_val_score_social = get_cross_validation_score(X2,y2,knn_social,5)
knn_scores_social = knn_social.predict_proba(X2)[:,1]
knn_roc_score_social = get_roc_scores(y2,knn_scores_social)


Fitting 10 folds for each of 50 candidates, totalling 500 fits
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   42.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.0min finished
{'n_neighbors': 14, 'weights': 'uniform'}
0.91361535727

RandomForest with FB Shares > 500 as target


In [43]:
params_rfc = {'max_depth': np.arange(20,100,5),
             'min_samples_leaf': np.arange(90,200,5),
             'n_estimators': [20]}
gs1_social = GridSearchCV(RandomForestClassifier(),param_grid=params_rfc, cv=10, scoring='roc_auc',n_jobs=-1,verbose=1)
gs1_social.fit(X2,y2)

rf_social = gs1_social.best_estimator_
rf_social.fit(X2,y2)
rf_cross_val_score_social = get_cross_validation_score(X2,y2,rf_social,5)
rf_scores_social = rf_social.predict_proba(X2)[:,1]
rf_roc_score_social = get_roc_scores(y2,rf_scores_social)


Fitting 10 folds for each of 352 candidates, totalling 3520 fits
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:   48.1s
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 3520 out of 3520 | elapsed:  1.5min finished

In [52]:
print gs1_social.best_params_
print gs1_social.best_score_


{'n_estimators': 20, 'max_depth': 25, 'min_samples_leaf': 115}
0.6579705052

In [53]:
print "Logistic Regression Cross-validation Score: ", cross_val_score_social
print "K Nearest Neighbors Cross-validation Score: ", knn_cross_val_score_social
print "RandomForest Cross-validation Score: ", rf_cross_val_score_social

print "Logistic Regressions ROC AUC score: ", roc_score_social
print "K Nearest Neighbors ROC AUC score: ", knn_roc_score_social
print "RandomForest ROC AUC score: ", rf_roc_score_social


Logistic Regression Cross-validation Score:  0.907193289634
K Nearest Neighbors Cross-validation Score:  0.913595823088
RandomForest Cross-validation Score:  0.913595823088
Logistic Regressions ROC AUC score:  0.836639925702
K Nearest Neighbors ROC AUC score:  0.786659140766
RandomForest ROC AUC score:  0.562823653745

In [45]:
np.mean(y)


Out[45]:
0.15179523640241735

About 15% of articles cross the 10,000-pageview threshold, so the ~0.85 cross-validation accuracies above are close to what always predicting the majority class would give; the ROC AUC scores are the more informative comparison.

In [46]:
simplereach.describe()


Out[46]:
Avg Engaged Time Social Actions Facebook Shares FaceBook Referrals
count 14898.000000 14898.000000 14898.000000 14898.000000
mean 88.869915 391.969929 61.045778 893.643845
std 57.559737 2632.323342 444.649227 7870.629805
min 5.000000 -19.000000 -1.000000 0.000000
25% 62.380740 1.000000 0.000000 0.000000
50% 77.141359 11.000000 2.000000 5.000000
75% 97.413157 119.000000 14.000000 105.000000
max 1360.000000 201271.000000 37480.000000 453648.000000
